{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import logging\n",
    "import math\n",
    "import os\n",
    "import random\n",
    "from dataclasses import dataclass\n",
    "from glob import glob\n",
    "from itertools import chain\n",
    "from pathlib import Path\n",
    "from typing import Any, Dict, List, Optional, Tuple\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "from datasets import load_dataset\n",
    "from PIL import Image\n",
    "from torch.utils.data import Dataset\n",
    "from tqdm.auto import tqdm\n",
    "from transformers import AutoProcessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "image_dir = \"data/jackyhate/text-to-image-2M/data_1024_10K\"\n",
    "\n",
    "\n",
    "def laod_data_from_dir(dir: str) -> List[Dict[str, Any]]:\n",
    "    \"\"\"Collect the `prompt` of every `*.json` file directly inside `dir`.\n",
    "\n",
    "    Args:\n",
    "        dir: Directory to scan (non-recursively) for `*.json` files.\n",
    "\n",
    "    Returns:\n",
    "        One `{'prompt': ..., 'file_path': ...}` dict per JSON file, sorted by\n",
    "        path. `prompt` is the value stored under the file's top-level\n",
    "        `prompt` key (`None` if the key is missing); `file_path` is the file\n",
    "        name without its `.json` suffix.\n",
    "    \"\"\"\n",
    "    image_text_dir = Path(dir)\n",
    "    # glob() yields files in filesystem order; sort so reruns produce the same output.\n",
    "    text_file_list = sorted(image_text_dir.glob(\"*.json\"))\n",
    "\n",
    "    def load_text(file_path: Path) -> Any:\n",
    "        # Read as UTF-8 explicitly instead of relying on the platform default encoding.\n",
    "        with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
    "            return json.load(f).get(\"prompt\")\n",
    "\n",
    "    # `.stem` removes only the final suffix, whereas str.replace would also\n",
    "    # delete any other \".json\" occurring inside the file name.\n",
    "    return [\n",
    "        {\"prompt\": load_text(file_path), \"file_path\": file_path.stem}\n",
    "        for file_path in tqdm(text_file_list)\n",
    "    ]\n",
    "\n",
    "\n",
    "final_data = laod_data_from_dir(image_dir)\n",
    "\n",
    "# One JSON object per line (JSON Lines). ensure_ascii=False writes non-ASCII\n",
    "# characters verbatim, so the file encoding is pinned to UTF-8.\n",
    "with open(\"data/jackyhate/text-to-image-2M/data_1024_10K.json\", \"w\", encoding=\"utf-8\") as f:\n",
    "    for i in final_data:\n",
    "        f.write(json.dumps(i, ensure_ascii=False) + \"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "image_dir = \"data/jackyhate/unzip2mdata\"\n",
    "\n",
    "# `laod_data_from_dir` is defined in an earlier cell of this notebook; it is\n",
    "# reused here instead of being redefined so both exports share one implementation.\n",
    "final_data = laod_data_from_dir(image_dir)\n",
    "\n",
    "# One JSON object per line (JSON Lines). ensure_ascii=False writes non-ASCII\n",
    "# characters verbatim, so the file encoding is pinned to UTF-8.\n",
    "with open(\"data/jackyhate/text-to-image-2M/data_1024_2m.json\", \"w\", encoding=\"utf-8\") as f:\n",
    "    for i in final_data:\n",
    "        f.write(json.dumps(i, ensure_ascii=False) + \"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"prompt\": \"A woman with long hair is sitting in front of a computer monitor displaying a music editing software interface. She is wearing headphones and is playing a keyboard with her right hand. The keyboard is connected to a computer with a mouse and a keyboard visible on the desk. The woman is smiling and appears to be engaged in creating music.\", \"file_path\": \"flux_512_100k_00045366\"}\n",
      "{\"prompt\": \"A mannequin is wearing a brown leather vest with a red collar and a red pocket on the left side.\", \"file_path\": \"flux_512_100k_00002392\"}\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first two lines of the exported file (one JSON object per line).\n",
    "!head -n 2 data/jackyhate/text-to-image-2M/data_1024_2m.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
